library(reticulate)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidytext)
library(quanteda)
## Package version: 2.1.1
## Parallel computing: 2 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
## 
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
## 
##     View
library(ggplot2)
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
## The following objects are masked from 'package:quanteda':
## 
##     meta, meta<-
## 
## Attaching package: 'tm'
## The following objects are masked from 'package:quanteda':
## 
##     as.DocumentTermMatrix, stopwords
library(topicmodels)
library(tidyverse)
## -- Attaching packages ----------------------------------------------------------------- tidyverse 1.3.0 --
## v tibble  3.0.3     v purrr   0.3.4
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts -------------------------------------------------------------------- tidyverse_conflicts() --
## x NLP::annotate() masks ggplot2::annotate()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(wordcloud)
## Loading required package: RColorBrewer
library(gutenbergr)
library(textclean)
library(foreach)
## 
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
library(parallel)
library(textstem)
## Loading required package: koRpus.lang.en
## Loading required package: koRpus
## Loading required package: sylly
## For information on available language packages for 'koRpus', run
## 
##   available.koRpus.lang()
## 
## and see ?install.koRpus.lang()
## 
## Attaching package: 'koRpus'
## The following object is masked from 'package:readr':
## 
##     tokenize
## The following objects are masked from 'package:quanteda':
## 
##     tokens, types
library(gmodels)
# Python bridge: bind reticulate to a conda environment, then source the
# Python helpers in py_script.py so they are callable from R.
# NOTE(review): no environment name is passed to use_condaenv() -- confirm
# which conda environment this resolves to on the machine running the script.
use_condaenv()
source_python(file = "py_script.py")

# Project-local R helper functions.
source(file = "Rfunctions.R")

# Load the stored news data and take element 30000 of its Body field as the
# single article used for the walkthrough; the bare name prints it.
news <- readRDS(file = "data/BBC_Reuters_GoogleNews_articles_business.rds")
article <- news$Body[30000]
article
## [1] "LOS ANGELES  (Reuters) - The union representing U.S. screenwriters called for a strike against film and TV studios starting on Monday in a move giving negotiators one last weekend to reach a contract deal or shatter 20 years of Hollywood labor peace. \n\n The strike deadline was issued on Friday, a day after a three-year contract covering the 12,000-member Writers Guild of America expired. It follows months of talks that deadlocked over the union's demands for a greater share of DVD and Internet revenues. Each side has accused the other of stonewalling and refusing to budge from unreasonable proposals. Union negotiators urged a walkout during a boisterous membership meeting on Thursday night, and the Writers Guild's governing board voted to ratify that recommendation. Hours later, a studio spokesman said the two sides had scheduled a meeting for 10 a.m. Sunday. Union leaders said at an afternoon news conference there still was time to avoid a strike that, if prolonged, could cost hundreds of millions of dollars in lost revenues and wages. \"We have 48 hours and what we really want to do is negotiate,\" said John Bowman, chairman of the union's negotiating committee. He said that while reluctant to go on strike, the Writers Guild felt it had to act decisively. \"We have to inflict as much damage as quickly as possible in order to get this thing over,\" Bowman said. The Alliance of Motion Picture and Television Producers, the bargaining arm of the studios, offered a statement by the group's president, Nick Counter, calling the Writers Guild's move toward a strike \"precipitous and irresponsible.\" \"Our goal continues to be to reach a fair and reasonable agreement that will keep the industry working,\" he said. Union officials said that barring a last-minute deal, the strike would begin at 3:01 a.m. EST and picket lines would go up in Los Angeles and New York City. 
## $1 BILLION AT RISK The last major Hollywood strike was a Writers Guild walkout in 1988 that lasted 22 weeks, delayed the start of the fall TV season and cost the industry an estimated $500 million. Los Angeles economist Jack Kyser said a strike of the same duration now could result in at least $1 billion in economic losses. The union says the overall compensation package sought by writers would cost $220 million over three years, a fraction of the $24.4 billion in revenues generated by U.S. DVD sales and rentals last year alone, according to accounting firm PricewaterhouseCoopers. A writers' strike would be little noticed by movie and TV audiences at first. Film studios' screenplay pipeline is well-stocked through 2008. And producers of prime-time sitcoms and dramas are said to have stockpiled enough advance episodes to keep their shows on the air until January or February. But late-night talk shows will go off the air almost immediately since they rely on a daily supply of topical jokes. On his CBS show on Thursday, David Letterman described the producers as \"cowards, cutthroats and weasels.\" Prime-time schedules will start filling up with more reruns and game shows after the networks have burned through fresh episodes. The new shows fighting to hold viewers' attention in the first few weeks of the new season face a grim future if they have to leave the schedule for an extended period. Negotiations on a new writers' contract began in July and the two sides have remained far apart. They brought in a federal mediator this week to try to break the deadlock on the key issue of compensating writers for the reuse of their work in various digital formats. The studios have said union demands for higher residuals on DVDs and Internet downloads would stifle growth at a time of rising production costs, tighter profits and piracy. They insist digital distribution of movies and TV remains largely experimental or promotional and new media is just developing. 
## The union accuses studios of pleading poverty and argues that writers have never had a fair deal on lucrative DVDs. They also see more film and TV migrating toward the Internet and wireless platforms and want a bigger share of that revenue. (Additional writing by Dean Goodman)"

# Start over

 # Split the article into sentences and store them one per row in the
 # `paragraph_text` column (the column keeps its "paragraph" name even though
 # each row holds a single sentence). rowid_to_column() then adds a running
 # row id as `paragraph_num`.
 art_parg <- rowid_to_column(
   data.frame(paragraph_text = unlist(tokenize_sentence(article))),
   var = "paragraph_num"
 )

 # Print every entity found in the sentences. The printed table carries the
 # sentence id, the entity as written (Named.Entity), its Label, and a
 # space-free form of the entity (NamedEntity).
 Extract_Named_Entities(art_parg)
##    paragraph_num                                            Named.Entity
## 1              1                                           LOS ANGELES  
## 2              1                                                 Reuters
## 3              1                                                    U.S.
## 4              1                                                  Monday
## 5              1                                                     one
## 6              1                                            last weekend
## 7              1                                                20 years
## 8              1                                               Hollywood
## 9              2                                                  Friday
## 10             2                                                   a day
## 11             2                                              three-year
## 12             2                                           12,000-member
## 13             2                                Writers Guild of America
## 14             5                                                Thursday
## 15             5                                                   night
## 16             5                                     the Writers Guild's
## 17             6                                                     two
## 18             6                                                 10 a.m.
## 19             7                                                  Sunday
## 20             8                                               afternoon
## 21             8                         hundreds of millions of dollars
## 22             9                                                48 hours
## 23             9                                             John Bowman
## 24            10                                       the Writers Guild
## 25            11                                                  Bowman
## 26            12 The Alliance of Motion Picture and Television Producers
## 27            12                                            Nick Counter
## 28            12                                     the Writers Guild's
## 29            14                                             last-minute
## 30            14                                               3:01 a.m.
## 31            15                                                     EST
## 32            15                                             Los Angeles
## 33            15                                           New York City
## 34            16                                              $1 BILLION
## 35            16                                               Hollywood
## 36            16                                           Writers Guild
## 37            16                                                    1988
## 38            16                                                22 weeks
## 39            16                         the start of the fall TV season
## 40            16                               an estimated $500 million
## 41            17                                             Los Angeles
## 42            17                                              Jack Kyser
## 43            17                                     at least $1 billion
## 44            18                                            $220 million
## 45            18                                             three years
## 46            18                                           $24.4 billion
## 47            18                                                    U.S.
## 48            19                                               last year
## 49            19                                  PricewaterhouseCoopers
## 50            20                                                   first
## 51            21                                                    2008
## 52            22                                                 January
## 53            22                                                February
## 54            23                                              late-night
## 55            23                                                   daily
## 56            24                                                     CBS
## 57            24                                                Thursday
## 58            24                                         David Letterman
## 59            26                                     the first few weeks
## 60            26                                          the new season
## 61            27                                                    July
## 62            27                                                     two
## 63            28                                               this week
## 64            33                                            Dean Goodman
##       Label                                   NamedEntity
## 1       GPE                                    LOSANGELES
## 2       ORG                                       Reuters
## 3       GPE                                          U.S.
## 4      DATE                                        Monday
## 5  CARDINAL                                           one
## 6      DATE                                   lastweekend
## 7      DATE                                       20years
## 8       GPE                                     Hollywood
## 9      DATE                                        Friday
## 10     DATE                                          aday
## 11     DATE                                    three-year
## 12     DATE                                 12,000-member
## 13      ORG                         WritersGuildofAmerica
## 14     DATE                                      Thursday
## 15     TIME                                         night
## 16      ORG                                  WritersGuild
## 17 CARDINAL                                           two
## 18     TIME                                        10a.m.
## 19     DATE                                        Sunday
## 20     TIME                                     afternoon
## 21    MONEY                   hundredsofmillionsofdollars
## 22     TIME                                       48hours
## 23   PERSON                                    JohnBowman
## 24      ORG                                  WritersGuild
## 25   PERSON                                        Bowman
## 26      ORG AllianceofMotionPictureandTelevisionProducers
## 27   PERSON                                   NickCounter
## 28      ORG                                  WritersGuild
## 29     TIME                                   last-minute
## 30     TIME                                      3:01a.m.
## 31      ORG                                           EST
## 32      GPE                                    LosAngeles
## 33      GPE                                   NewYorkCity
## 34    MONEY                                     $1BILLION
## 35      GPE                                     Hollywood
## 36      ORG                                  WritersGuild
## 37     DATE                                          1988
## 38     DATE                                       22weeks
## 39     DATE                        startofthefallTVseason
## 40    MONEY                        anestimated$500million
## 41      GPE                                    LosAngeles
## 42   PERSON                                     JackKyser
## 43    MONEY                              atleast$1billion
## 44    MONEY                                   $220million
## 45     DATE                                    threeyears
## 46    MONEY                                  $24.4billion
## 47      GPE                                          U.S.
## 48     DATE                                      lastyear
## 49      ORG                        PricewaterhouseCoopers
## 50  ORDINAL                                         first
## 51     DATE                                          2008
## 52     DATE                                       January
## 53     DATE                                      February
## 54     TIME                                    late-night
## 55     DATE                                         daily
## 56      ORG                                           CBS
## 57     DATE                                      Thursday
## 58   PERSON                                DavidLetterman
## 59     DATE                                 firstfewweeks
## 60     DATE                                     newseason
## 61     DATE                                          July
## 62 CARDINAL                                           two
## 63     DATE                                      thisweek
## 64   PERSON                                   DeanGoodman
# Run NE_Cleansing() over the sentence table, using `paragraph_num` as the id
# column and `paragraph_text` as the text column. The last (positional)
# argument is the entity lookup: extracted entities restricted to the labels
# GPE, ORG, PERSON, LOC and NORP, with the Label column dropped and duplicate
# rows removed.
# NOTE(review): the exact meaning of `group = TRUE` / `rm = FALSE` is defined
# in NE_Cleansing() itself (not visible here). In the printed result the new
# TEXT column shows matched entities as single space-free tokens rather than
# removed.
art_rm_NE <- NE_Cleansing(
  art_parg,
  "paragraph_num",
  "paragraph_text",
  group = TRUE,
  rm = FALSE,
  Extract_Named_Entities(art_parg) %>%
    filter(Label %in% c("GPE", "ORG", "PERSON", "LOC", "NORP")) %>%
    select(-Label) %>%
    unique()
)

art_rm_NE
##    paragraph_num
## 1              1
## 2              2
## 3              3
## 4              4
## 5              5
## 6              6
## 7              7
## 8              8
## 9              9
## 10            10
## 11            11
## 12            12
## 13            13
## 14            14
## 15            15
## 16            16
## 17            17
## 18            18
## 19            19
## 20            20
## 21            21
## 22            22
## 23            23
## 24            24
## 25            25
## 26            26
## 27            27
## 28            28
## 29            29
## 30            30
## 31            31
## 32            32
## 33            33
##                                                                                                                                                                                                                                                paragraph_text
## 1  LOS ANGELES  (Reuters) - The union representing U.S. screenwriters called for a strike against film and TV studios starting on Monday in a move giving negotiators one last weekend to reach a contract deal or shatter 20 years of Hollywood labor peace.
## 2                                                                                                                    The strike deadline was issued on Friday, a day after a three-year contract covering the 12,000-member Writers Guild of America expired.
## 3                                                                                                                                       It follows months of talks that deadlocked over the union's demands for a greater share of DVD and Internet revenues.
## 4                                                                                                                                                          Each side has accused the other of stonewalling and refusing to budge from unreasonable proposals.
## 5                                                                                    Union negotiators urged a walkout during a boisterous membership meeting on Thursday night, and the Writers Guild's governing board voted to ratify that recommendation.
## 6                                                                                                                                                                      Hours later, a studio spokesman said the two sides had scheduled a meeting for 10 a.m.
## 7                                                                                                                                                                                                                                                     Sunday.
## 8                                                                        Union leaders said at an afternoon news conference there still was time to avoid a strike that, if prolonged, could cost hundreds of millions of dollars in lost revenues and wages.
## 9                                                                                                                             "We have 48 hours and what we really want to do is negotiate," said John Bowman, chairman of the union's negotiating committee.
## 10                                                                                                                                                             He said that while reluctant to go on strike, the Writers Guild felt it had to act decisively.
## 11                                                                                                                                                   "We have to inflict as much damage as quickly as possible in order to get this thing over," Bowman said.
## 12                  The Alliance of Motion Picture and Television Producers, the bargaining arm of the studios, offered a statement by the group's president, Nick Counter, calling the Writers Guild's move toward a strike "precipitous and irresponsible."
## 13                                                                                                                                          "Our goal continues to be to reach a fair and reasonable agreement that will keep the industry working," he said.
## 14                                                                                                                                                                  Union officials said that barring a last-minute deal, the strike would begin at 3:01 a.m.
## 15                                                                                                                                                                                         EST and picket lines would go up in Los Angeles and New York City.
## 16                                                      $1 BILLION AT RISK The last major Hollywood strike was a Writers Guild walkout in 1988 that lasted 22 weeks, delayed the start of the fall TV season and cost the industry an estimated $500 million.
## 17                                                                                                                            Los Angeles economist Jack Kyser said a strike of the same duration now could result in at least $1 billion in economic losses.
## 18                                                                                  The union says the overall compensation package sought by writers would cost $220 million over three years, a fraction of the $24.4 billion in revenues generated by U.S.
## 19                                                                                                                                                                DVD sales and rentals last year alone, according to accounting firm PricewaterhouseCoopers.
## 20                                                                                                                                                                              A writers' strike would be little noticed by movie and TV audiences at first.
## 21                                                                                                                                                                                            Film studios' screenplay pipeline is well-stocked through 2008.
## 22                                                                                               And producers of prime-time sitcoms and dramas are said to have stockpiled enough advance episodes to keep their shows on the air until January or February.
## 23                                                                                                                                       But late-night talk shows will go off the air almost immediately since they rely on a daily supply of topical jokes.
## 24                                                                                                                                                 On his CBS show on Thursday, David Letterman described the producers as "cowards, cutthroats and weasels."
## 25                                                                                                                          Prime-time schedules will start filling up with more reruns and game shows after the networks have burned through fresh episodes.
## 26                                                                                   The new shows fighting to hold viewers' attention in the first few weeks of the new season face a grim future if they have to leave the schedule for an extended period.
## 27                                                                                                                                                           Negotiations on a new writers' contract began in July and the two sides have remained far apart.
## 28                                                                                 They brought in a federal mediator this week to try to break the deadlock on the key issue of compensating writers for the reuse of their work in various digital formats.
## 29                                                                              The studios have said union demands for higher residuals on DVDs and Internet downloads would stifle growth at a time of rising production costs, tighter profits and piracy.
## 30                                                                                                                            They insist digital distribution of movies and TV remains largely experimental or promotional and new media is just developing.
## 31                                                                                                                                        The union accuses studios of pleading poverty and argues that writers have never had a fair deal on lucrative DVDs.
## 32                                                                                                                               They also see more film and TV migrating toward the Internet and wireless platforms and want a bigger share of that revenue.
## 33                                                                                                                                                                                                                       (Additional writing by Dean Goodman)
##                                                                                                                                                                                                                                                       TEXT
## 1  LOSANGELES(Reuters) - The union representing U.S. screenwriters called for a strike against film and TV studios starting on Monday in a move giving negotiators one last weekend to reach a contract deal or shatter 20 years of Hollywood labor peace.
## 2                                                                                                                    The strike deadline was issued on Friday, a day after a three-year contract covering the 12,000-member WritersGuildofAmerica expired.
## 3                                                                                                                                    It follows months of talks that deadlocked over the union's demands for a greater share of DVD and Internet revenues.
## 4                                                                                                                                                       Each side has accused the other of stonewalling and refusing to budge from unreasonable proposals.
## 5                                                                                        Union negotiators urged a walkout during a boisterous membership meeting on Thursday night, and WritersGuild governing board voted to ratify that recommendation.
## 6                                                                                                                                                                   Hours later, a studio spokesman said the two sides had scheduled a meeting for 10 a.m.
## 7                                                                                                                                                                                                                                                  Sunday.
## 8                                                                     Union leaders said at an afternoon news conference there still was time to avoid a strike that, if prolonged, could cost hundreds of millions of dollars in lost revenues and wages.
## 9                                                                                                                           "We have 48 hours and what we really want to do is negotiate," said JohnBowman, chairman of the union's negotiating committee.
## 10                                                                                                                                                               He said that while reluctant to go on strike, WritersGuild felt it had to act decisively.
## 11                                                                                                                                                "We have to inflict as much damage as quickly as possible in order to get this thing over," Bowman said.
## 12                                 AllianceofMotionPictureandTelevisionProducers, the bargaining arm of the studios, offered a statement by the group's president, NickCounter, calling WritersGuild move toward a strike "precipitous and irresponsible."
## 13                                                                                                                                       "Our goal continues to be to reach a fair and reasonable agreement that will keep the industry working," he said.
## 14                                                                                                                                                               Union officials said that barring a last-minute deal, the strike would begin at 3:01 a.m.
## 15                                                                                                                                                                                         EST and picket lines would go up in LosAngeles and NewYorkCity.
## 16                                                    $1 BILLION AT RISK The last major Hollywood strike was a WritersGuild walkout in 1988 that lasted 22 weeks, delayed the start of the fall TV season and cost the industry an estimated $500 million.
## 17                                                                                                                           LosAngeles economist JackKyser said a strike of the same duration now could result in at least $1 billion in economic losses.
## 18                                                                               The union says the overall compensation package sought by writers would cost $220 million over three years, a fraction of the $24.4 billion in revenues generated by U.S.
## 19                                                                                                                                                             DVD sales and rentals last year alone, according to accounting firm PricewaterhouseCoopers.
## 20                                                                                                                                                                           A writers' strike would be little noticed by movie and TV audiences at first.
## 21                                                                                                                                                                                         Film studios' screenplay pipeline is well-stocked through 2008.
## 22                                                                                            And producers of prime-time sitcoms and dramas are said to have stockpiled enough advance episodes to keep their shows on the air until January or February.
## 23                                                                                                                                    But late-night talk shows will go off the air almost immediately since they rely on a daily supply of topical jokes.
## 24                                                                                                                                               On his CBS show on Thursday, DavidLetterman described the producers as "cowards, cutthroats and weasels."
## 25                                                                                                                       Prime-time schedules will start filling up with more reruns and game shows after the networks have burned through fresh episodes.
## 26                                                                                The new shows fighting to hold viewers' attention in the first few weeks of the new season face a grim future if they have to leave the schedule for an extended period.
## 27                                                                                                                                                        Negotiations on a new writers' contract began in July and the two sides have remained far apart.
## 28                                                                              They brought in a federal mediator this week to try to break the deadlock on the key issue of compensating writers for the reuse of their work in various digital formats.
## 29                                                                           The studios have said union demands for higher residuals on DVDs and Internet downloads would stifle growth at a time of rising production costs, tighter profits and piracy.
## 30                                                                                                                         They insist digital distribution of movies and TV remains largely experimental or promotional and new media is just developing.
## 31                                                                                                                                     The union accuses studios of pleading poverty and argues that writers have never had a fair deal on lucrative DVDs.
## 32                                                                                                                            They also see more film and TV migrating toward the Internet and wireless platforms and want a bigger share of that revenue.
## 33                                                                                                                                                                                                                     (Additional writing by DeanGoodman)
# Tokenise each paragraph of `art_rm_NE` into cleaned, lemmatised words and
# count how often every word occurs per paragraph.
# Result: one row per (id, word) with the frequency in `n`, where `id` is the
# paragraph number.
df <- art_rm_NE %>%
  select(paragraph_num, TEXT) %>%
  # Keep the original case for now: it is the only way to tell an article
  # glued onto an entity name ("TheWritersGuild") from an ordinary word that
  # merely starts with "the" ("theater", "theory").
  unnest_tokens(output = word, input = TEXT, to_lower = FALSE) %>%
  mutate(
    # Drop a leading article only when a capitalised name follows it.
    word = str_remove(word, "^[Tt]he(?=[A-Z])"),
    word = tolower(word),
    # Strip possessive endings before lemmatising.
    word = str_remove(word, "'s$"),
    word = textstem::lemmatize_words(word)
  ) %>%
  # Discard tokens that start with a digit (numbers, "2008", "3rd", ...).
  filter(!str_detect(word, "^\\d")) %>%
  anti_join(stop_words, by = "word") %>%
  # Tokens of one or two characters carry no topical information.
  filter(nchar(word) > 2) %>%
  dplyr::count(id = paragraph_num, word)
## Joining, by = "word"
# Print the per-paragraph word counts (columns: id, word, n) for inspection.
df
##     id                                          word n
## 1    1                                          call 1
## 2    1                                      contract 1
## 3    1                                          deal 1
## 4    1                                          film 1
## 5    1                                     hollywood 1
## 6    1                                         labor 1
## 7    1                                    losangeles 1
## 8    1                                        monday 1
## 9    1                                          move 1
## 10   1                                    negotiator 1
## 11   1                                         peace 1
## 12   1                                         reach 1
## 13   1                                     represent 1
## 14   1                                       reuters 1
## 15   1                                  screenwriter 1
## 16   1                                       shatter 1
## 17   1                                         start 1
## 18   1                                        strike 1
## 19   1                                        studio 1
## 20   1                                           u.s 1
## 21   1                                         union 1
## 22   1                                       weekend 1
## 23   2                                      contract 1
## 24   2                                         cover 1
## 25   2                                           day 1
## 26   2                                      deadline 1
## 27   2                                        expire 1
## 28   2                                        friday 1
## 29   2                                         issue 1
## 30   2                                        strike 1
## 31   2                         writersguildofamerica 1
## 32   3                                      deadlock 1
## 33   3                                        demand 1
## 34   3                                           dvd 1
## 35   3                                        follow 1
## 36   3                                      internet 1
## 37   3                                         month 1
## 38   3                                       revenue 1
## 39   3                                         share 1
## 40   3                                          talk 1
## 41   3                                         union 1
## 42   4                                        accuse 1
## 43   4                                         budge 1
## 44   4                                      proposal 1
## 45   4                                        refuse 1
## 46   4                                     stonewall 1
## 47   4                                  unreasonable 1
## 48   5                                         board 1
## 49   5                                    boisterous 1
## 50   5                                        govern 1
## 51   5                                          meet 1
## 52   5                                    membership 1
## 53   5                                    negotiator 1
## 54   5                                         night 1
## 55   5                                        ratify 1
## 56   5                                recommendation 1
## 57   5                                      thursday 1
## 58   5                                         union 1
## 59   5                                          urge 1
## 60   5                                          vote 1
## 61   5                                       walkout 1
## 62   5                                  writersguild 1
## 63   6                                           a.m 1
## 64   6                                          hour 1
## 65   6                                          late 1
## 66   6                                          meet 1
## 67   6                                      schedule 1
## 68   6                                     spokesman 1
## 69   6                                        studio 1
## 70   7                                        sunday 1
## 71   8                                     afternoon 1
## 72   8                                         avoid 1
## 73   8                                    conference 1
## 74   8                                          cost 1
## 75   8                                        dollar 1
## 76   8                                       hundred 1
## 77   8                                        leader 1
## 78   8                                          lose 1
## 79   8                                       million 1
## 80   8                                          news 1
## 81   8                                       prolong 1
## 82   8                                       revenue 1
## 83   8                                        strike 1
## 84   8                                          time 1
## 85   8                                         union 1
## 86   8                                          wage 1
## 87   9                                      chairman 1
## 88   9                                     committee 1
## 89   9                                          hour 1
## 90   9                                    johnbowman 1
## 91   9                                     negotiate 2
## 92   9                                         union 1
## 93  10                                           act 1
## 94  10                                    decisively 1
## 95  10                                          feel 1
## 96  10                                     reluctant 1
## 97  10                                        strike 1
## 98  10                                  writersguild 1
## 99  11                                        bowman 1
## 100 11                                        damage 1
## 101 11                                       inflict 1
## 102 11                                       quickly 1
## 103 12 allianceofmotionpictureandtelevisionproducers 1
## 104 12                                           arm 1
## 105 12                                       bargain 1
## 106 12                                          call 1
## 107 12                                 irresponsible 1
## 108 12                                          move 1
## 109 12                                   nickcounter 1
## 110 12                                         offer 1
## 111 12                                   precipitous 1
## 112 12                                     president 1
## 113 12                                     statement 1
## 114 12                                        strike 1
## 115 12                                        studio 1
## 116 12                                  writersguild 1
## 117 13                                     agreement 1
## 118 13                                      continue 1
## 119 13                                          fair 1
## 120 13                                          goal 1
## 121 13                                      industry 1
## 122 13                                         reach 1
## 123 13                                    reasonable 1
## 124 14                                           a.m 1
## 125 14                                           bar 1
## 126 14                                         begin 1
## 127 14                                          deal 1
## 128 14                                        minute 1
## 129 14                                      official 1
## 130 14                                        strike 1
## 131 14                                         union 1
## 132 15                                           est 1
## 133 15                                          line 1
## 134 15                                    losangeles 1
## 135 15                                   newyorkcity 1
## 136 15                                        picket 1
## 137 16                                       billion 1
## 138 16                                          cost 1
## 139 16                                         delay 1
## 140 16                                      estimate 1
## 141 16                                          fall 1
## 142 16                                     hollywood 1
## 143 16                                      industry 1
## 144 16                                         major 1
## 145 16                                       million 1
## 146 16                                          risk 1
## 147 16                                        season 1
## 148 16                                         start 1
## 149 16                                        strike 1
## 150 16                                       walkout 1
## 151 16                                          week 1
## 152 16                                  writersguild 1
## 153 17                                       billion 1
## 154 17                                      duration 1
## 155 17                                      economic 1
## 156 17                                     economist 1
## 157 17                                     jackkyser 1
## 158 17                                    losangeles 1
## 159 17                                          loss 1
## 160 17                                        result 1
## 161 17                                        strike 1
## 162 18                                       billion 1
## 163 18                                  compensation 1
## 164 18                                          cost 1
## 165 18                                      fraction 1
## 166 18                                      generate 1
## 167 18                                       million 1
## 168 18                                       package 1
## 169 18                                       revenue 1
## 170 18                                          seek 1
## 171 18                                           u.s 1
## 172 18                                         union 1
## 173 18                                        writer 1
## 174 19                                        accord 1
## 175 19                                       account 1
## 176 19                                           dvd 1
## 177 19                                          firm 1
## 178 19                        pricewaterhousecoopers 1
## 179 19                                        rental 1
## 180 19                                          sale 1
## 181 20                                      audience 1
## 182 20                                         movie 1
## 183 20                                        notice 1
## 184 20                                        strike 1
## 185 20                                        writer 1
## 186 21                                          film 1
## 187 21                                      pipeline 1
## 188 21                                    screenplay 1
## 189 21                                         stock 1
## 190 21                                        studio 1
## 191 22                                       advance 1
## 192 22                                           air 1
## 193 22                                         drama 1
## 194 22                                       episode 1
## 195 22                                      february 1
## 196 22                                       january 1
## 197 22                                         prime 1
## 198 22                                      producer 1
## 199 22                                        sitcom 1
## 200 22                                     stockpile 1
## 201 22                                          time 1
## 202 23                                           air 1
## 203 23                                         daily 1
## 204 23                                   immediately 1
## 205 23                                          joke 1
## 206 23                                          late 1
## 207 23                                         night 1
## 208 23                                          rely 1
## 209 23                                        supply 1
## 210 23                                          talk 1
## 211 23                                       topical 1
## 212 24                                           cbs 1
## 213 24                                        coward 1
## 214 24                                    cutthroats 1
## 215 24                                davidletterman 1
## 216 24                                      describe 1
## 217 24                                      producer 1
## 218 24                                      thursday 1
## 219 24                                        weasel 1
## 220 25                                          burn 1
## 221 25                                       episode 1
## 222 25                                          fill 1
## 223 25                                         fresh 1
## 224 25                                          game 1
## 225 25                                       network 1
## 226 25                                         prime 1
## 227 25                                         rerun 1
## 228 25                                      schedule 1
## 229 25                                         start 1
## 230 25                                          time 1
## 231 26                                     attention 1
## 232 26                                        extend 1
## 233 26                                         fight 1
## 234 26                                        future 1
## 235 26                                          grim 1
## 236 26                                          hold 1
## 237 26                                         leave 1
## 238 26                                        period 1
## 239 26                                      schedule 1
## 240 26                                        season 1
## 241 26                                        viewer 1
## 242 26                                          week 1
## 243 27                                         begin 1
## 244 27                                      contract 1
## 245 27                                          july 1
## 246 27                                   negotiation 1
## 247 27                                        remain 1
## 248 27                                        writer 1
## 249 28                                         break 1
## 250 28                                         bring 1
## 251 28                                    compensate 1
## 252 28                                      deadlock 1
## 253 28                                       digital 1
## 254 28                                       federal 1
## 255 28                                        format 1
## 256 28                                         issue 1
## 257 28                                           key 1
## 258 28                                      mediator 1
## 259 28                                         reuse 1
## 260 28                                          week 1
## 261 28                                        writer 1
## 262 29                                          cost 1
## 263 29                                        demand 1
## 264 29                                      download 1
## 265 29                                          dvds 1
## 266 29                                        growth 1
## 267 29                                      internet 1
## 268 29                                        piracy 1
## 269 29                                    production 1
## 270 29                                        profit 1
## 271 29                                      residual 1
## 272 29                                          rise 1
## 273 29                                        stifle 1
## 274 29                                        studio 1
## 275 29                                         tight 1
## 276 29                                          time 1
## 277 29                                         union 1
## 278 30                                       develop 1
## 279 30                                       digital 1
## 280 30                                  distribution 1
## 281 30                                  experimental 1
## 282 30                                        insist 1
## 283 30                                        medium 1
## 284 30                                         movie 1
## 285 30                                   promotional 1
## 286 30                                        remain 1
## 287 31                                        accuse 1
## 288 31                                         argue 1
## 289 31                                          deal 1
## 290 31                                          dvds 1
## 291 31                                          fair 1
## 292 31                                     lucrative 1
## 293 31                                         plead 1
## 294 31                                       poverty 1
## 295 31                                        studio 1
## 296 31                                         union 1
## 297 31                                        writer 1
## 298 32                                          film 1
## 299 32                                      internet 1
## 300 32                                       migrate 1
## 301 32                                      platform 1
## 302 32                                       revenue 1
## 303 32                                         share 1
## 304 32                                      wireless 1
## 305 33                                    additional 1
## 306 33                                   deangoodman 1
## 307 33                                         write 1
# Build the lemmatised per-paragraph counts that feed the document-term matrix.
# `df` already holds cleaned, lower-cased tokens with their frequency in `n`,
# so only the lemmatisation and the filters are re-applied here; stripping
# "the" prefixes again would only damage already-clean words.
df_lemma <- df %>%
  mutate(word = textstem::lemmatize_words(word)) %>%
  anti_join(stop_words, by = "word") %>%
  filter(nchar(word) > 2) %>%
  # Weight by the existing frequency: a plain count() would count rows and
  # silently reset every `n` to 1, discarding repeated words in a paragraph.
  dplyr::count(id, word, wt = n)
## Joining, by = "word"
# Cast the tidy (id, word, n) counts into a document-term matrix:
# one document per paragraph id, one term per word, cell value = n.
dtm <- tidytext::cast_dtm(df_lemma, document = id, term = word, value = n)

# Print the matrix summary (dimensions, sparsity, weighting).
dtm
## <<DocumentTermMatrix (documents: 33, terms: 218)>>
## Non-/sparse entries: 307/6887
## Sparsity           : 96%
## Maximal term length: 45
## Weighting          : term frequency (tf)
# Run the LDA_optimal() helper (defined outside this section) on the
# document-term matrix. Its result is read below for `min_perp$model` and
# `stationary_prep$model`.
# NOTE(review): the meaning of the positional arguments 2, 10, 5 is not
# visible here -- presumably a topic-count range plus a step or fold count;
# confirm against the helper's definition.
mod = LDA_optimal(dtm, 2, 10, 5)
# Display the `Plot` element of the helper's result.
mod$Plot
# Collect the 20 top-ranked terms of every topic in `mod$min_perp$model` and
# attach each term's total frequency in the corpus, giving one row per
# (topic, word) for the word cloud.
# NOTE(review): the table printed further below reads
# `mod$stationary_prep$model` instead -- confirm which model is intended.
top_terms <- terms(mod$min_perp$model, k = 20)

# Stack the term matrix column by column so that every topic the model
# actually has is included, rather than a hard-coded "Topic 1" to "Topic 4".
terms <- data.frame(
  topic = rep(colnames(top_terms), each = nrow(top_terms)),
  word = as.vector(top_terms),
  stringsAsFactors = FALSE
) %>%
  # Keep only terms that have a frequency in `df`.
  inner_join(df, by = "word") %>%
  select(-id) %>%
  # Sum the per-paragraph counts into one corpus-wide frequency per term,
  # then collapse the now-identical rows.
  group_by(topic, word) %>%
  mutate(n = sum(n)) %>%
  ungroup() %>%
  distinct()
## Joining, by = "word"
# Show the 20 top-ranked terms per topic of the model stored under
# `stationary_prep`, one column per topic.
as.data.frame(terms(mod$stationary_prep$model, k = 20))
##                  Topic 1   Topic 2    Topic 3       Topic 4
## 1             negotiator    strike      union        studio
## 2                  reach      time     writer  writersguild
## 3                   talk      deal    revenue          film
## 4                  night     start       cost          call
## 5                    a.m      week    million      internet
## 6                   late     issue  hollywood          hour
## 7               schedule    season     demand        monday
## 8                    air     movie     accuse  screenwriter
## 9                  labor   episode   thursday           dvd
## 10               reuters     prime   industry         share
## 11                   u.s  producer        u.s         budge
## 12 writersguildofamerica      move    weekend      proposal
## 13                   dvd     peace   deadlock  unreasonable
## 14                 board represent     follow          meet
## 15                govern   shatter   internet    johnbowman
## 16                  meet     cover      month irresponsible
## 17            membership       day boisterous     statement
## 18        recommendation  deadline     ratify          fair
## 19                  vote    friday       urge           est
## 20             spokesman  deadlock    walkout   newyorkcity
##                                          Topic 5
## 1                                       contract
## 2                                     losangeles
## 3                                        billion
## 4                                          begin
## 5                                         remain
## 6                                        digital
## 7                                           move
## 8                                         expire
## 9                                        walkout
## 10                                        sunday
## 11                                       prolong
## 12                                      chairman
## 13                                     committee
## 14                                    decisively
## 15                                        bowman
## 16                                        damage
## 17                                       quickly
## 18 allianceofmotionpictureandtelevisionproducers
## 19                                           arm
## 20                                       bargain
# Draw a word cloud of the topic terms: word size follows the corpus
# frequency, word colour identifies the topic the term belongs to.
topic_factor <- factor(terms$topic)
topic_palette <- brewer.pal(8, "Dark2")
# "Dark2" offers at most 8 colours; interpolate extra ones when the model has
# more topics, otherwise the surplus topics would be indexed as NA colours.
if (nlevels(topic_factor) > length(topic_palette)) {
  topic_palette <- grDevices::colorRampPalette(topic_palette)(
    nlevels(topic_factor)
  )
}

wordcloud(
  words = terms$word,
  freq = terms$n,
  min.freq = 1,
  max.words = 40,
  random.order = FALSE,
  rot.per = 0.1,
  # One colour per word, in the same order as `words`.
  ordered.colors = TRUE,
  colors = topic_palette[as.integer(topic_factor)]
)

library(LDAvis)